# hTreg_analysis.R --------------------------------------------------------

# Author: sclancy / vgouirand 
# Description: differential expression analysis of bulk-RNAseq data
# Input: here::here("input", "outs") 
# Output: here::here("output", "hTreg_counts.csv")
# Date: 2023_02_27

# Library Import ----------------------------------------------------------

library(conflicted)
library(AnnotationDbi)
library(biomaRt)
library(edgeR)
library(fgsea)
library(GenomicFeatures)
library(here)
library(limma)
library(msigdbr)
library(readxl)
library(tidyverse)
library(tximeta)
library(tximport)
library(VennDiagram)

# Console width used when printing results.
options(width = 80)

# conflicted is loaded above, so declare the preferred filter() through it
# instead of copying the function into the global environment. Bare filter()
# calls further down then resolve to dplyr::filter.
conflicted::conflict_prefer(name = "filter", winner = "dplyr")

# Metadata  ---------------------------------------------------------------
# Read the sample sheet and derive every covariate used downstream in a single
# pipeline: donor labels P1-P4, guide (CRISPR), coating, the combined group
# factor, sex and a per-sample ID of the form <donor>_<CRISPR>_<coating>.
hTreg_metadata <-
  read_xlsx(path = here("input", "sample annotations.xlsx")) %>%
  # Step 1: relabel the raw annotation columns and fix the factor level order.
  mutate(
    donor = recode(
      donor,
      `Del 106` = "P1",
      `Patient A` = "P2",
      `Patient B` = "P3",
      LD = "P4"
    ),
    CRISPR = factor(
      recode(CRISPR, CTRL = "sgCTRL", LAYN = "sgLAYN"),
      levels = c("sgLAYN", "sgCTRL")
    ),
    coating = factor(coating, levels = c("HA", "noHA"))
  ) %>%
  # Step 2: columns that are built from the relabelled ones.
  mutate(
    group = factor(
      paste(CRISPR, coating, sep = "_"),
      levels = c("sgLAYN_HA", "sgLAYN_noHA", "sgCTRL_HA", "sgCTRL_noHA")
    ),
    # Sex is hard-coded here: P4 is labelled F, every other donor M.
    sex = factor(ifelse(donor == "P4", "F", "M"), levels = c("F", "M")),
    ID = paste(donor, CRISPR, coating, sep = "_")
  )



# Gene References ---------------------------------------------------------

# Build a TxDb from the GTF in input/ (file name indicates GRCh38, release 109).
GRCh38 <- makeTxDbFromGFF(
  here("input", "Homo_sapiens.GRCh38.109.gtf.gz"),
  format = "gtf",
  organism = "Homo sapiens"
)
# Recorded console output:
# Import genomic features from the file as a GRanges object ... OK
# Prepare the 'metadata' data frame ... OK
# Make the TxDb object ... OK
# Warning message:
#   In .get_cds_IDX(mcols0$type, mcols0$phase) :
#   The "phase" metadata column contains non-NA values for features of type
# stop_codon. This information was ignored.

# tx2gene: one row per transcript name with its gene ID, passed to tximport()
# as `tx2gene`.
ttg <- AnnotationDbi::select(
  GRCh38,
  keys = keys(GRCh38, keytype = "TXNAME"),
  columns = "GENEID",
  keytype = "TXNAME"
)

# biomaRt
# 20230228
biomaRt::listEnsembl()
#         biomart                version
# 1         genes      Ensembl Genes 109
# 2 mouse_strains      Mouse strains 109
# 3          snps  Ensembl Variation 109
# 4    regulation Ensembl Regulation 109

# Use the cached release-109 mart object when it is available. Previously the
# mart was always requested from Ensembl and then immediately overwritten by
# the cached copy, so the remote call was wasted work. The remote request is
# now only the fallback for a missing cache file.
ensembl_mart_file <- here("input", "hsapiens_gene_ensembl_109_mart.rds")

ensembl <- if (file.exists(ensembl_mart_file)) {
  read_rds(file = ensembl_mart_file)
} else {
  useEnsembl(
    biomart = "genes",
    dataset = "hsapiens_gene_ensembl",
    version = 109
  )
}

# Lookup table: Ensembl gene ID -> HGNC symbol and chromosome.
key <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
  mart = ensembl
)

# Import Abundances -------------------------------------------------------

# Collect every kallisto abundance.tsv below input/outs (full paths).
hTreg_abundances <-
  list.files(
    path = here("input", "outs"),
    full.names = TRUE,
    recursive = TRUE
  ) %>%
  grep(
    pattern = "/abundance\\.tsv$", # dot escaped so only the literal file name matches
    value = TRUE
  )

# Sample IDs are attached purely by position, so at minimum the number of
# files must equal the number of metadata rows.
stopifnot(
  "Number of abundance.tsv files must equal nrow(hTreg_metadata)" =
    length(hTreg_abundances) == nrow(hTreg_metadata)
)

# NOTE(review): this assumes the row order of the sample sheet matches the
# order in which list.files() returns the files. Nothing here verifies it;
# confirm against the directory names before trusting the sample labels.
hTreg_abundances <- setNames(hTreg_abundances, nm = hTreg_metadata$ID)



# TxImport ----------------------------------------------------------------

# Import the kallisto transcript estimates and collapse them to gene level
# using the transcript-to-gene table built above.
txi <- tximport(
  hTreg_abundances,
  type = "kallisto",
  tx2gene = ttg,
  txIn = TRUE,                              # input is transcript level (ENST)
  txOut = FALSE,                            # output is summarised per gene
  ignoreTxVersion = TRUE,                   # drop the ".<version>" suffix of target_id
  countsFromAbundance = "lengthScaledTPM"
)


# Limma -------------------------------------------------------------------

# DGEList holding all samples (P4 included); used for the QC plots below.
# The full sample sheet travels along as the sample annotation.
y <- DGEList(
  counts = txi[["counts"]],
  group = hTreg_metadata[["group"]],
  samples = hTreg_metadata
  # genes = key[which(row.names(txi$counts) %in% key$ensembl_gene_id), ] # match genes in counts to gene order in key
)

# y[y$gene$chromosome_name != "Y", ]




# layn counts
# Per-sample counts for LAYN, coloured by donor. The Ensembl ID is looked up
# with %in% so that a missing (NA) symbol in `key` cannot produce an NA row
# index. The plotted column name is still the hard-coded LAYN gene ID.
y$counts[key$ensembl_gene_id[key$hgnc_symbol %in% "LAYN"], , drop = FALSE] %>%
  t() %>%
  as_tibble(rownames = "ID") %>%
  full_join(y = y$samples, by = "ID") %>%
  ggplot(aes(x = ID, y = ENSG00000204381)) +
  geom_col(aes(fill = donor)) +
  labs(
    y = "LAYN",
    x = NULL
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.5))) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# total counts
# Column sums of the count matrix (one bar per sample), coloured by donor.
y$counts %>%
  colSums() %>%
  as_tibble(rownames = "ID") %>%
  full_join(y = y$samples, by = "ID") %>%
  ggplot(aes(x = ID, y = value)) +
  geom_col(aes(fill = donor)) +
  labs(
    y = "Counts",
    x = NULL
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.5))) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# mds: before and after removing the donor effect

# Point colours, computed once and reused.
donor_cols <- scales::hue_pal()(4)[factor(y$samples$donor)]
crispr_cols <- scales::hue_pal()(2)[factor(y$samples$CRISPR)]

# removeBatchEffect() operates on log-expression values and plotMDS() does not
# transform a plain matrix, so the donor correction is applied to log-CPM
# rather than to the raw counts.
logcpm <- edgeR::cpm(y, log = TRUE)
logcpm_no_donor <- removeBatchEffect(x = logcpm, batch = y$samples$donor)

# 1) uncorrected, coloured by donor
plotMDS(x = y, col = donor_cols, cex = 0.75)
# 2) donor effect removed, coloured by donor
plotMDS(x = logcpm_no_donor, col = donor_cols, cex = 0.75)
# 3) donor effect removed, coloured by guide
plotMDS(x = logcpm_no_donor, col = crispr_cols, cex = 0.75)


# remove P4
# Subset the metadata once and pick the count columns by sample ID, so the
# count matrix and the metadata cannot drift apart. Previously the counts were
# filtered by a "P4" name prefix and the metadata by donor != "P4".
hTreg_metadata <- hTreg_metadata[hTreg_metadata$donor != "P4", ]

y <- DGEList(
  counts = txi$counts[, hTreg_metadata$ID, drop = FALSE],
  samples = hTreg_metadata$ID,
  group = hTreg_metadata$group
)

# Means model for the four groups plus an additive donor term.
design <- model.matrix(~ 0 + group + donor, data = hTreg_metadata)

# Drop low-expression genes, recompute library sizes, normalise, then voom.
keep <- filterByExpr(y = y, design = design)
y <- y[keep, , keep.lib.sizes = FALSE]
y <- calcNormFactors(y)
v <- voom(y, design, plot = TRUE)
# v2 <- voomWithQualityWeights(counts = y, design = design, plot = TRUE)

# Contrasts of interest, expressed in terms of the design-matrix columns.
# Only the sgLAYN vs sgCTRL comparison without HA is currently active; the
# remaining definitions are kept for reference.
# NOTE(review): `contrasts` and `lmFit` below reuse the names of functions;
# the calls still work, but renaming would be clearer if nothing else depends
# on these object names.
contrasts <- makeContrasts(
  # sgLAYN_HA_v_sgLAYN_noHA = groupsgLAYN_HA - groupsgLAYN_noHA, # coef = 1
  # sgLAYN_HA_v_sgCTRL_HA = groupsgLAYN_HA - groupsgCTRL_HA, # coef = 2
  sgLAYN_noHA_v_sgCTRL_noHA = groupsgLAYN_noHA - groupsgCTRL_noHA, # coef = 3
  # sgCTRL_HA_v_sgCTRL_noHA = groupsgCTRL_HA - groupsgCTRL_noHA, # coef = 4
  # LAYN_v_CTRL = (groupsgLAYN_HA + groupsgLAYN_noHA) / 2 -
  #   (groupsgCTRL_HA + groupsgCTRL_noHA) / 2,  # coef = 5
  # HA_v_noHA = (groupsgLAYN_HA + groupsgCTRL_HA) / 2 -
  #   (groupsgLAYN_noHA + groupsgCTRL_noHA) / 2, # coef = 6
  levels = colnames(design)
)

# Linear model on the voom output (weights = v$weights is left unset here).
lmFit <- lmFit(v, design = design)

# Re-express the fit in terms of the contrasts defined above.
contrastsFit <- contrasts.fit(lmFit, contrasts = contrasts)

# Empirical Bayes moderation.
# NOTE(review): trend = TRUE is the setting limma documents for the
# limma-trend (logCPM) workflow; `v` comes from voom(), which presumably
# already models the mean-variance relationship. Confirm this is intended.
ebayes <- eBayes(contrastsFit, trend = TRUE)

# One full topTable() per contrast, stacked into a single long table.
# Iterating over seq_len() (named by contrast) is safe for any number of
# contrasts and removes the unused argument of the previous mapply() call.
degs <- lapply(
  X = setNames(seq_len(ncol(contrasts)), colnames(contrasts)),
  FUN = function(coef_index) {
    topTable(
      fit = ebayes,
      coef = coef_index,
      adjust.method = "BH",
      sort.by = "none",
      number = Inf
    ) %>%
      rownames_to_column(var = "Ensembl") # gene IDs are the row names
  }
) %>%
  bind_rows(.id = "contrast") %>%
  # Contrast names have the form <group1>_v_<group2>.
  separate(
    col = contrast,
    into = c("group1", "group2"),
    sep = "_v_",
    remove = FALSE
  )

# Attach HGNC symbol and chromosome; all matching rows of `key` are kept.
degs_hgnc <- left_join(
  x = degs,
  y = key,
  by = c("Ensembl" = "ensembl_gene_id"),
  multiple = "all"
)

# Volcano plot per contrast with LAYN highlighted.
# %in% (instead of ==) keeps genes without an HGNC symbol: with ==, an NA
# symbol yields an NA colour and those points are dropped from the plot.
degs_hgnc %>%
  ggplot(aes(x = logFC, y = -log10(adj.P.Val))) +
  geom_point(
    aes(color = ifelse(test = hgnc_symbol %in% "LAYN", "red", "black"))
  ) +
  ggrepel::geom_text_repel(
    aes(label = ifelse(test = hgnc_symbol %in% "LAYN", hgnc_symbol, ""))
  ) +
  scale_color_identity() +
  facet_wrap(~ contrast, nrow = 1) +
  theme_classic() +
  theme(
    strip.background = element_blank(),
    strip.text.x = element_text(face = "bold")
  ) +
  coord_cartesian(clip = "off")

# Number of significant rows (adj.P.Val < 0.05) per contrast.
degs_hgnc %>% dplyr::filter(adj.P.Val < 0.05) %>% dplyr::count(contrast)

# The data viewer is only opened in an interactive session.
if (interactive()) {
  degs_hgnc %>% dplyr::filter(adj.P.Val < 0.05) %>% View()
}

# Bar chart of the number of significant rows per contrast.
degs_hgnc %>%
  dplyr::filter(adj.P.Val < 0.05) %>%
  dplyr::count(contrast) %>%
  ggplot(aes(x = contrast, y = n)) +
  geom_col(aes(fill = contrast)) +
  # .pt is exported by ggplot2, so `:::` is not needed; 8 / .pt gives 8 pt text
  geom_text(aes(label = n), size = 8 / ggplot2::.pt, nudge_y = 100) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  guides(fill = guide_legend(nrow = 1)) +
  theme_classic() +
  labs(
    x = NULL,
    y = "No. DEGs"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 35, hjust = 1)
  )

# NOTE(review): name_count is computed per *contrast*, so this keeps rows only
# when the contrast has exactly two significant rows. The name suggests the
# count was meant to be per gene (e.g. genes significant in two contrasts);
# confirm the intended grouping variable.
degs_hgnc %>%
  dplyr::filter(contrast %in% "sgLAYN_noHA_v_sgCTRL_noHA", adj.P.Val < 0.05) %>%
  group_by(contrast) %>%
  mutate(name_count = n()) %>%
  ungroup() %>%
  filter(name_count == 2) %>%
  dplyr::select(-name_count)

# fgsea -------------------------------------------------------------------

# all
msigdb <- msigdbr::msigdbr(species = "Homo sapiens")

# Gene symbols per gene-set name. The braces stop magrittr from also passing
# the data frame as the first positional argument of split(): when the dot is
# used only inside nested calls such as `.$gene_symbol`, the left-hand side is
# still inserted as the first argument.
msigdb_split <- msigdb %>%
  {
    split(x = .$gene_symbol, f = .$gs_name)
  }

# gene ontology

go <- msigdbr::msigdbr(species = "Homo sapiens", category = "C5")

# Keep only the C5 sub-collections whose name starts with "GO:".
go_split <- go %>%
  dplyr::filter(grepl("^GO\\:", gs_subcat)) %>%
  {
    split(x = .$gene_symbol, f = .$gs_name)
  }

# Ranked gene lists for fgsea: one named numeric vector per contrast
# (names = HGNC symbol, values = logFC, sorted by decreasing logFC).
rank_list <- vector(mode = "list")
for (i in unique(degs_hgnc$contrast)) {
  message("Generating gene set ", i)
  rank_list[[i]] <- degs_hgnc %>%
    # rows of this contrast that have a non-empty symbol
    dplyr::filter(contrast == i, hgnc_symbol != "") %>%
    dplyr::select(hgnc_symbol, logFC) %>%
    dplyr::arrange(-logFC) %>%
    # A symbol can map to several Ensembl IDs. Keep its highest-logFC row.
    # The previous slice_sample(n = 1) picked one at random without a seed,
    # so the ranking was not reproducible between runs.
    dplyr::filter(!duplicated(hgnc_symbol)) %>%
    tibble::deframe()
}

# Sanity checks on the first ranking: its length and any repeated symbols.
# The commented values below are output recorded from an earlier run.
rank_list[[1]] %>% length()
# 16708
names(rank_list[[1]])[which(!isUnique(names(rank_list[[1]])))]
# [1] "NSUN5P2"   "HERC3"     "CCDC144BP" "CD99P1"    "HERC2P9"   "CCDC144BP"
# [7] "BTN2A3P"   "RRN3P3"    "POLR2J3"   "POLR2J3"   "GOLGA2P5"  "ZNF252P"  
# [13] "BTN2A3P"   "TBCE"      "RRP7BP"    "HERC2P9"   "RRN3P3"    "GOLGA2P5" 
# [19] "HERC3"     "CD99P1"    "ADCY10P1"  "NSUN5P2"   "TBCE"      "CASTOR3P" 
# [25] "ADCY10P1"  "CASTOR3P"  "ZNF252P"   "RRP7BP"  


names(rank_list)

# fgsea_go
# Run fgseaMultilevel() against the GO gene sets for every ranked list.
# NOTE(review): set.seed() is called once here while fgsea runs with
# nproc = 8; whether the seed carries over to the parallel workers is not
# visible from this script. Confirm if exact reproducibility matters.
set.seed(42L)
fgsea_go <- mapply(
  FUN = function(x, y) {
    message("Running fgsea on ", y,  " contrast.")
    fgsea::fgseaMultilevel(
      pathways = go_split,
      stats = x,
      eps = 0,
      minSize = 15,
      scoreType = "std",
      nproc = 8
    )
  },
  x = rank_list,
  y = names(rank_list),
  SIMPLIFY = FALSE,
  USE.NAMES = TRUE
)

# Stack the per-contrast results and flatten the leadingEdge list column into
# a comma-separated string. vapply() guarantees a character vector even when
# there are no rows, which sapply() does not.
fgsea_go <- bind_rows(fgsea_go, .id = "contrast") %>%
  mutate(
    leadingEdge = vapply(leadingEdge, toString, FUN.VALUE = character(1))
  )